Regression Analysis for Water Temperature Data

The goal of this analysis is to develop a regression to predict water temperature when data are missing or the time series is incomplete.

Currently this analysis is performed for Feather River and Yuba River. The resulting dataset with predicted values are saved and integrated in the development of a water temperature dataset.

Data used to build models: Butte Creek

Butte Creek is used to build the regression models because the time series is complete and the data are high quality.

Feather River

Data Preparation

  1. Pull in gage data from (GRL will represent the High Flow Channel (HFC) and FRA will represent the Low Flow Channel (LFC):
  • GRL (2003-03-05 to 2007-06-01 H; 2020-01-04 to present E): located after Thermalito Afterbay
  • FRA (2002-01-01 to present): located between Lake Oroville and Thermalito Afterbay
  1. Prepare datasets for regression analysis (dataset with no missing data to train and dataset with missing data to predict)

LFC

Plot of mean temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Feather River LFC and Butte Creek

## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# LFC regression and predictions

# MEAN Regression
split <-rsample::initial_split(feather_lfc_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8395 -0.8432  0.0335  0.7871  5.9478 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.446e+00  1.154e+00  -2.986  0.00287 ** 
## date         5.728e-04  6.144e-05   9.323  < 2e-16 ***
## butte_temp   4.137e-01  6.247e-03  66.215  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.382 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7431, Adjusted R-squared:  0.7427 
## F-statistic:  2263 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08329577

Building Min Regression

# MIN Regression
split <-rsample::initial_split(feather_lfc_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6302 -0.8263  0.0339  0.7729  6.2951 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.265e+00  1.139e+00  -1.989   0.0469 *  
## date         5.116e-04  6.064e-05   8.436   <2e-16 ***
## butte_temp   3.855e-01  6.613e-03  58.293   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.347 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6922, Adjusted R-squared:  0.6918 
## F-statistic:  1760 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08777431

Building Max Regression

# MAX Regression
split <-rsample::initial_split(feather_lfc_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_lfc_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(feather_lfc_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1963 -0.7564 -0.0088  0.7945  6.1494 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.414e+00  1.178e+00  -3.746 0.000186 ***
## date         6.159e-04  6.258e-05   9.842  < 2e-16 ***
## butte_temp   4.402e-01  5.806e-03  75.813  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.414 on 1565 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7892, Adjusted R-squared:  0.7889 
## F-statistic:  2929 on 2 and 1565 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_lfc_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08601363

Predictions

# Predictions
feather_gap_predicted_mean_lfc <- predict(feather_lfc_mod_mean, feather_gap_mean)
feather_lfc_mean_predicted <- feather_gap_mean |> 
  mutate(value = feather_gap_predicted_mean_lfc,
         statistic = "mean") |> 
  select(date, value, statistic)
ggplot(feather_lfc_mean_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_min_lfc <- predict(feather_lfc_mod_min, feather_gap_min)
feather_lfc_min_predicted <- feather_gap_min |> 
  mutate(value = feather_gap_predicted_min_lfc,
         statistic = "min") |> 
  select(date, value, statistic)
ggplot(feather_lfc_min_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_max_lfc <- predict(feather_lfc_mod_max, feather_gap_max)
feather_lfc_max_predicted <- feather_gap_max |> 
  mutate(value = feather_gap_predicted_max_lfc,
         statistic = "max") |> 
  select(date, value, statistic)
ggplot(feather_lfc_max_predicted, aes(x = date, y = value)) +
  geom_line()

Full dataset

feather_gap_lfc <- bind_rows(feather_lfc_max_predicted,
                             feather_lfc_mean_predicted,
                             feather_lfc_min_predicted) |> 
  mutate(stream = "feather river",
         site_group = "upper feather lfc") %>% 
  pivot_wider(id_cols = c(date, stream, site_group), values_from = "value", names_from = "statistic") |>
  rename(mean_i = mean,
         max_i = max,
         min_i = min) |> 
  full_join(feather_lfc_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number, site_group), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_i, mean),
         min = ifelse(is.na(min), min_i, min),
         max = ifelse(is.na(max), max_i, max)) |>
  select(-c(min_i, mean_i, max_i)) |> 
  pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |> 
  group_by(stream, date, statistic, gage_agency, gage_number, site_group) |> glimpse()
## Joining with `by = join_by(date, stream, site_group)`
## Rows: 26,367
## Columns: 7
## Groups: stream, date, statistic, gage_agency, gage_number, site_group [26,367]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "feather river", "feather river", "feather river", "feathe…
## $ site_group  <chr> "upper feather lfc", "upper feather lfc", "upper feather l…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 4.814357, 4.446810, 5.189872, 4.883876, 4.887606, 4.997627…
ggplot() +
  geom_line(data = feather_gap_lfc, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).

HFC

Plot of mean temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_mean, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_min, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Feather River HFC and Butte Creek

# plot for mean
ggplot(data = feather_hfc_regression_data_max, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# HFC regression and predictions

# MEAN Regression
split <-rsample::initial_split(feather_hfc_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4405  -1.3154  -0.1361   1.4364   4.9478 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.9770748  2.4046439   6.644 5.00e-11 ***
## date        -0.0007853  0.0001889  -4.158 3.49e-05 ***
## butte_temp   0.6603000  0.0138146  47.797  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.218 on 1001 degrees of freedom
## Multiple R-squared:  0.6954, Adjusted R-squared:  0.6948 
## F-statistic:  1143 on 2 and 1001 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] NA

Building Min Regression

# MIN Regression
split <-rsample::initial_split(feather_hfc_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.9841  -1.1224   0.0197   1.2451   4.3444 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.0944603  2.1503179   7.485 1.57e-13 ***
## date        -0.0008143  0.0001688  -4.824 1.62e-06 ***
## butte_temp   0.6780694  0.0135243  50.137  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.966 on 1000 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7158, Adjusted R-squared:  0.7152 
## F-statistic:  1259 on 2 and 1000 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.1172097

Building Max Regression

# MAX Regression
split <-rsample::initial_split(feather_hfc_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
feather_hfc_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(feather_hfc_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.1371  -1.3613  -0.2248   1.7535   5.8303 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.2663624  2.8496087   6.410 2.24e-10 ***
## date        -0.0009096  0.0002240  -4.061 5.26e-05 ***
## butte_temp   0.6100411  0.0144402  42.246  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.615 on 1000 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.641,  Adjusted R-squared:  0.6403 
## F-statistic: 892.9 on 2 and 1000 DF,  p-value: < 2.2e-16
test_predict <- predict(feather_hfc_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.1330883

Predictions

# Predictions
feather_gap_predicted_mean_hfc <- predict(feather_hfc_mod_mean, feather_gap_mean)
feather_hfc_mean_predicted <- feather_gap_mean |> 
  mutate(value = feather_gap_predicted_mean_hfc,
         statistic = "mean") |> 
  select(date, value, statistic)
ggplot(feather_hfc_mean_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_min_hfc <- predict(feather_hfc_mod_min, feather_gap_min)
feather_hfc_min_predicted <- feather_gap_min |> 
  mutate(value = feather_gap_predicted_min_hfc,
         statistic = "min") |> 
  select(date, value, statistic)
ggplot(feather_hfc_min_predicted, aes(x = date, y = value)) +
  geom_line()

feather_gap_predicted_max_hfc <- predict(feather_hfc_mod_max, feather_gap_max)
feather_hfc_max_predicted <- feather_gap_max |> 
  mutate(value = feather_gap_predicted_max_hfc,
         statistic = "max") |> 
  select(date, value, statistic)
ggplot(feather_hfc_max_predicted, aes(x = date, y = value)) +
  geom_line()

Full dataset

feather_gap_hfc <- bind_rows(feather_hfc_max_predicted,
                             feather_hfc_mean_predicted,
                             feather_hfc_min_predicted) |> 
  mutate(stream = "feather river",
         site_group = "upper feather hfc") %>% 
  pivot_wider(id_cols = c(date, stream, site_group), values_from = "value", names_from = "statistic") |>
  rename(mean_i = mean,
         max_i = max,
         min_i = min) |> 
  full_join(feather_hfc_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number, site_group), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_i, mean),
         min = ifelse(is.na(min), min_i, min),
         max = ifelse(is.na(max), max_i, max)) |> 
  select(-c(min_i, mean_i, max_i)) |> 
  pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |> 
  group_by(stream, date, statistic, gage_agency, gage_number, site_group) |> glimpse()
## Joining with `by = join_by(date, stream, site_group)`
## Rows: 26,403
## Columns: 7
## Groups: stream, date, statistic, gage_agency, gage_number, site_group [26,403]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "feather river", "feather river", "feather river", "feathe…
## $ site_group  <chr> "upper feather hfc", "upper feather hfc", "upper feather h…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 10.542370, 11.228641, 10.427630, 10.651635, 11.837773, 10.…
ggplot() +
  geom_line(data = feather_gap_hfc, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).

Yuba River

Data Prepartion

  1. Pull in gage data from YR7 CDEC gage, however, this only contains data from 2020 onwards. Temperature data collected along with RST data was originally used to fill the gap prior to 2020; however, due to inconsistencies in these data sources the resulting predicted mean values were lower than the min values as the RST data only has mean data.

  2. Prepare datasets for regression analysis (dataset with no missing data to train and dataset with missing data to predict)

Plot of mean temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_mean, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of min temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_min, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Plot of max temp for Yuba River and Butte Creek

# plot for mean
ggplot(data = yuba_regression_data_max, aes(x = butte_temp, y = temp)) +
  geom_point() +
  stat_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Building Mean Regression

# MEAN Regression
split <-rsample::initial_split(yuba_regression_data_mean, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_mean <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_mean)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9651 -1.0363  0.0472  1.0556  3.1283 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.199e+01  1.577e+00   13.94   <2e-16 ***
## date        -8.313e-04  8.292e-05  -10.03   <2e-16 ***
## butte_temp   5.941e-01  6.825e-03   87.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.404 on 1297 degrees of freedom
## Multiple R-squared:  0.8569, Adjusted R-squared:  0.8566 
## F-statistic:  3882 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_mean, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.08520138

Building Min Regression

# MIN Regression
split <-rsample::initial_split(yuba_regression_data_min, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_min <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_min)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9789 -0.9879  0.0217  1.0360  3.1786 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.298e+01  1.599e+00   14.37   <2e-16 ***
## date        -8.884e-04  8.418e-05  -10.55   <2e-16 ***
## butte_temp   5.471e-01  7.510e-03   72.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.418 on 1297 degrees of freedom
## Multiple R-squared:  0.8074, Adjusted R-squared:  0.8071 
## F-statistic:  2719 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_min, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.1006144

Building Max Regression

# MAX Regression
split <-rsample::initial_split(yuba_regression_data_max, prop = 0.8)
train <- rsample::training(split)
test <- rsample::testing(split)
yuba_mod_max <- lm(temp ~ date + butte_temp, data = train)
summary(yuba_mod_max)
## 
## Call:
## lm(formula = temp ~ date + butte_temp, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1868 -0.9594  0.0410  0.9962  6.3462 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.806e+01  1.571e+00  11.501  < 2e-16 ***
## date        -6.222e-04  8.236e-05  -7.554 7.93e-14 ***
## butte_temp   6.390e-01  6.232e-03 102.528  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.408 on 1297 degrees of freedom
## Multiple R-squared:  0.8923, Adjusted R-squared:  0.8922 
## F-statistic:  5375 on 2 and 1297 DF,  p-value: < 2.2e-16
test_predict <- predict(yuba_mod_max, test)
test_predict_df <- test |>
  mutate(predicted = test_predict)
# evaluate predictions - MAPE of 10% is not bad
mean(abs((
  test_predict_df$predicted - test_predict_df$temp
)) / test_predict_df$temp)
## [1] 0.07569647

Predictions

# Predictions
yuba_gap_predicted_mean <- predict(yuba_mod_mean, yuba_gap_mean)
yuba_mean_predicted <- yuba_gap_mean |> 
  mutate(value = yuba_gap_predicted_mean,
         statistic = "mean_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_mean_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

yuba_gap_predicted_min <- predict(yuba_mod_min, yuba_gap_min)
yuba_min_predicted <- yuba_gap_min |> 
  mutate(value = yuba_gap_predicted_min,
         statistic = "min_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_min_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

yuba_gap_predicted_max <- predict(yuba_mod_max, yuba_gap_max)
yuba_max_predicted <- yuba_gap_max |> 
  mutate(value = yuba_gap_predicted_max,
         statistic = "max_predicted") |> 
  select(date, value, statistic)
ggplot(yuba_max_predicted, aes(x = date, y = value)) +
  geom_line()
## Warning: Removed 1 row containing missing values (`geom_line()`).

Full dataset

yuba_gap <- bind_rows(yuba_max_predicted,
                      yuba_mean_predicted,
                      yuba_min_predicted) |> 
  mutate(stream = "yuba river") |> 
  pivot_wider(id_cols = c(date, stream), values_from = "value", names_from = "statistic") |> 
  full_join(yuba_format |> 
              pivot_wider(id_cols = c(stream, date, gage_agency, gage_number), values_from = "value", names_from = "statistic")) |> 
  mutate(gage_agency = ifelse(is.na(mean), "interpolated", gage_agency),
         gage_number = ifelse(is.na(mean), "interpolated", gage_number),
         mean = ifelse(is.na(mean), mean_predicted, mean),
         min = ifelse(is.na(min), min_predicted, min),
         max = ifelse(is.na(max), max_predicted, max)) |> 
  select(-c(mean_predicted, max_predicted, min_predicted)) |> 
    pivot_longer(cols = mean:min, values_to = "value", names_to = "statistic") |>
  group_by(stream, date, statistic, gage_agency, gage_number) |> glimpse()
## Joining with `by = join_by(date, stream)`
## Rows: 26,367
## Columns: 6
## Groups: stream, date, statistic, gage_agency, gage_number [26,367]
## $ date        <date> 1999-12-31, 1999-12-31, 1999-12-31, 2000-01-01, 2000-01-0…
## $ stream      <chr> "yuba river", "yuba river", "yuba river", "yuba river", "y…
## $ gage_agency <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ gage_number <chr> "interpolated", "interpolated", "interpolated", "interpola…
## $ statistic   <chr> "mean", "max", "min", "mean", "max", "min", "mean", "max",…
## $ value       <dbl> 15.73326, 14.31347, 15.87375, 15.83144, 14.95185, 15.59932…
ggplot() +
  geom_line(data = yuba_gap, aes(x = date, y = value, color = statistic)) +
  theme_minimal()
## Warning: Removed 3 rows containing missing values (`geom_line()`).